Correlation Network Analysis¶
Gene network analysis is a method designed to identify sub-networks (modules) of correlated genes, which are likely to be co-expressed. This can be helpful in identification of sub-networks (modules) of genes that contribute to disease. In this example, we will cover how to create a pairwise correlation matrix of genes, as well as how to associate them with disease.
import pandas as pd
import numpy as np
import scanpy as sc
import anndata
import networkx as nx
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
import json
import scipy
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
from scipy.spatial.distance import pdist, squareform
from statsmodels.stats.multitest import multipletests
import random
#Load in the required data
datExpr = pd.read_csv('/ReCoDE-Gene-Network-Analysis/data/data/Bcell_datExpr_pseudobulk.csv', index_col = 0)
metadata = pd.read_csv('/ReCoDE-Gene-Network-Analysis/data/data/Bcell_metadata_pseudobulk.csv', index_col = 0)
datExpr
| ISG15 | LINC01342 | TTLL10-AS1 | TNFRSF18 | CALML6 | CHD5 | ICMT-DT | MIR34AHG | RBP7 | MTOR-AS1 | ... | FRMPD3 | TSC22D3 | KLHL13 | AKAP14 | RHOXF1-AS1 | TMEM255A | SMIM10L2B-AS1 | IL9R_ENSG00000124334 | DDX3Y | EIF1AY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| donor_id | |||||||||||||||||||||
| CH-20-001 | 6.380902 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 53.239480 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 21.632603 | 17.641195 |
| CH-20-002 | 12.606751 | 2.33599 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 1.089918 | 0.000000 | 1.158743 | 1.173824 | ... | 0.000000 | 112.643970 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 45.432410 | 22.809190 |
| CH-20-004 | 12.302510 | 0.00000 | 0.000000 | 21.512184 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 42.873410 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 15.570595 | 20.173725 |
| CH-20-005 | 18.603716 | 1.16925 | 1.232658 | 4.975880 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.112746 | 190.337740 | 0.000000 | 0.0000 | 1.191559 | 0.000000 | 0.000000 | 0.000000 | 6.931139 | 1.071742 |
| CH-21-002 | 13.705297 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 44.942260 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 1.323198 | 0.000000 | 0.000000 |
| CH-21-006 | 4.377715 | 0.00000 | 0.000000 | 23.782143 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.023552 | 0.000000 | ... | 0.000000 | 12.741602 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.349793 | 12.981407 |
| CH-21-008 | 18.058025 | 0.00000 | 0.000000 | 44.614340 | 0.00000 | 1.201673 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 76.893720 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 1.080360 | 1.188176 | 2.377049 |
| CH-21-013 | 21.395964 | 0.00000 | 0.000000 | 30.426510 | 0.00000 | 0.000000 | 1.235703 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 54.458330 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 1.117969 | 1.236817 | 23.543072 | 53.250420 |
| CH-21-014 | 13.436963 | 0.00000 | 0.000000 | 11.067089 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 32.248600 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.709280 | 21.636190 |
| CH-21-017 | 22.916807 | 0.00000 | 0.000000 | 9.076924 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 2.478934 | 0.000000 | ... | 0.000000 | 188.600070 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 49.114600 | 40.454937 |
| CH-21-020 | 197.794700 | 0.00000 | 0.000000 | 122.788270 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.047435 | 0.000000 | ... | 0.000000 | 197.616580 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.765914 | 88.202680 | 173.938080 |
| CH-21-021 | 13.898113 | 0.00000 | 0.000000 | 11.169237 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 20.431047 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 11.017612 | 21.158054 |
| CH-21-028 | 7.210576 | 0.00000 | 1.066841 | 1.321003 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 57.428060 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.989963 | 0.000000 |
| CH-21-029 | 9.007506 | 0.00000 | 0.000000 | 1.928462 | 1.21185 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 157.941900 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.745571 | 2.051687 |
| CH-21-031 | 30.211197 | 0.00000 | 0.000000 | 40.325450 | 0.00000 | 0.000000 | 0.000000 | 2.130981 | 0.000000 | 0.000000 | ... | 1.244156 | 12.550498 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.227465 | 0.906813 |
| CH-21-033 | 21.972580 | 0.00000 | 0.000000 | 84.504500 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 152.658000 | 1.167664 | 1.2505 | 0.000000 | 1.199426 | 0.000000 | 0.000000 | 45.661453 | 142.600740 |
| CH-21-034 | 54.934030 | 0.00000 | 0.000000 | 147.552780 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.886594 | 167.975900 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.075679 |
| CH-21-036 | 17.018766 | 0.00000 | 0.000000 | 2.483573 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 88.924920 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 39.051266 | 10.004631 |
| CH-21-037 | 150.473450 | 0.00000 | 0.000000 | 53.255013 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 38.325650 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 33.786545 | 58.778214 |
| CH-21-046 | 9.337872 | 0.00000 | 0.000000 | 28.949800 | 0.00000 | 1.123670 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 28.600826 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.238980 | 12.119887 |
| CH-21-073 | 4.982193 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 40.201653 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 27.179262 | 2.954510 |
| CH-21-074 | 3.954194 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 1.127058 | 0.000000 | 0.000000 | ... | 0.000000 | 18.354240 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.401694 | 2.069999 |
| CH-21-077 | 33.969110 | 0.00000 | 0.000000 | 3.333775 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.115637 | 0.000000 | ... | 0.000000 | 161.007570 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.646068 | 0.000000 |
| CH-21-079 | 7.030363 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 40.449474 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 18.332941 | 11.980942 |
24 rows × 1000 columns
metadata
| nCount_RNA | nFeature_RNA | donor_id.1 | MUTATION | percent.mt | scType_celltype | tissue_type | cell_type | tissue | development_stage | male | female | CH | normal | DNMT3A | TET2 | NoMutation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| donor_id | |||||||||||||||||
| CH-20-001 | 2490.0 | 1403 | CH-20-001 | DNMT3A R882C | 6.119578 | Naive B cells | tissue | B cell | blood | 60 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-20-002 | 1192.0 | 629 | CH-20-002 | DNMT3A R729W (4%), DNMT3A R736C (2%) | 3.803975 | Naive B cells | tissue | B cell | blood | 68 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-20-004 | 1833.0 | 985 | CH-20-004 | TET2 R1516X (30%), TET2 Q659X (29%), SRSF2 P95... | 5.335196 | Naive B cells | tissue | B cell | blood | 85 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-20-005 | 1966.0 | 886 | CH-20-005 | TET2 V1900F (2%) | 5.314136 | Naive B cells | tissue | B cell | blood | 58 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| CH-21-002 | 1912.0 | 938 | CH-21-002 | none | 5.657238 | Naive B cells | tissue | B cell | blood | 48 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-006 | 1356.0 | 709 | CH-21-006 | DNMT3A R882H (13%) | 5.211849 | Naive B cells | tissue | B cell | blood | 67 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-008 | 1117.0 | 575 | CH-21-008 | none | 8.398348 | Naive B cells | tissue | B cell | blood | 70 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-013 | 1321.0 | 816 | CH-21-013 | none | 4.663212 | Naive B cells | tissue | B cell | blood | 73 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-014 | 1064.0 | 623 | CH-21-014 | SRSF2 P95R (40%), TET2 L957Ifs*15 (51%) | 4.146577 | Naive B cells | tissue | B cell | blood | 74 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-017 | 1880.0 | 953 | CH-21-017 | DNMT3A R882H (20%), IDH2 R140Q (10%), TP53 R27... | 6.519922 | Naive B cells | tissue | B cell | blood | 65 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-020 | 5325.0 | 2286 | CH-21-020 | none | 5.631046 | Naive B cells | tissue | B cell | blood | 61 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-021 | 1671.0 | 943 | CH-21-021 | none | 3.214286 | Naive B cells | tissue | B cell | blood | 83 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-028 | 1690.0 | 866 | CH-21-028 | none | 6.053894 | Naive B cells | tissue | B cell | blood | 89 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-029 | 2180.0 | 1073 | CH-21-029 | TET2 G68X (2%) | 2.570194 | Naive B cells | tissue | B cell | blood | 83 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| CH-21-031 | 1592.0 | 887 | CH-21-031 | none | 6.734398 | Naive B cells | tissue | B cell | blood | 78 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-033 | 2219.0 | 1138 | CH-21-033 | TET2 (33%) | 5.670567 | Naive B cells | tissue | B cell | blood | 81 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-034 | 2010.0 | 974 | CH-21-034 | DNMT3A Q816X (8%) | 7.937365 | Naive B cells | tissue | B cell | blood | 39 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-036 | 2686.0 | 1337 | CH-21-036 | DNMT3A splice (7%) | 3.909544 | Naive B cells | tissue | B cell | blood | 91 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-037 | 3546.0 | 1645 | CH-21-037 | TET2 (6.2%) | 4.473764 | Naive B cells | tissue | B cell | blood | 71 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-046 | 1918.0 | 907 | CH-21-046 | DNMT3A W305X (24%) | 4.807084 | Naive B cells | tissue | B cell | blood | 80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-073 | 2148.0 | 1096 | CH-21-073 | SRSF2 (33%), TET2 Y1245Lfs*22 (27%), TET2 Q742... | 5.174489 | Naive B cells | tissue | B cell | blood | 77 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-074 | 1322.0 | 708 | CH-21-074 | TET2 C1378Y (23%) | 3.328561 | Naive B cells | tissue | B cell | blood | 70 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-077 | 1715.0 | 934 | CH-21-077 | DNMT3A R749C (9.1%) | 6.539510 | Naive B cells | tissue | B cell | blood | 50 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-079 | 1354.0 | 793 | CH-21-079 | DNMT3A M880V (5%) | 6.386293 | Naive B cells | tissue | B cell | blood | 78 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
Correlation is a statistical measure that describes the extent to which two variables change together. It indicates the strength and direction of a linear relationship between two variables. Correlation analysis for coexpression networks is a method used to study the relationships between genes or proteins by analysing their expression levels across various conditions, tissues, or time points. We will be exploring correlation based co-expression networks within this exercise.
Step 1: Calculate Correlation¶
#You can use the corr function to calculate pairwise pearson correlations between the genes.
correlation_matrix = datExpr.corr()
Lets view the correlation matrix. You will see that there is a diagonal of the value 1, due to the calculating the correlation between the same gene.
correlation_matrix
| ISG15 | LINC01342 | TTLL10-AS1 | TNFRSF18 | CALML6 | CHD5 | ICMT-DT | MIR34AHG | RBP7 | MTOR-AS1 | ... | FRMPD3 | TSC22D3 | KLHL13 | AKAP14 | RHOXF1-AS1 | TMEM255A | SMIM10L2B-AS1 | IL9R_ENSG00000124334 | DDX3Y | EIF1AY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ISG15 | 1.000000 | -0.092502 | -0.106168 | 0.645245 | -0.093365 | -0.102921 | -0.080086 | -0.052127 | 0.180437 | -0.076886 | ... | 0.029464 | 0.334351 | -0.034003 | -0.034003 | -0.049428 | -0.034003 | -0.036643 | 0.187654 | 0.602785 | 0.673038 |
| LINC01342 | -0.092502 | 1.000000 | 0.281837 | -0.183851 | -0.059383 | -0.085815 | 0.556591 | -0.081739 | 0.227693 | 0.890407 | ... | 0.186025 | 0.251452 | -0.059383 | -0.059383 | 0.416022 | -0.059383 | -0.059383 | -0.124612 | 0.173334 | -0.071186 |
| TTLL10-AS1 | -0.106168 | 0.281837 | 1.000000 | -0.181582 | -0.062692 | -0.090597 | -0.090459 | -0.086293 | -0.140100 | -0.062692 | ... | 0.383858 | 0.218250 | -0.062692 | -0.062692 | 0.743859 | -0.062692 | -0.062692 | -0.131555 | -0.209975 | -0.179643 |
| TNFRSF18 | 0.645245 | -0.183851 | -0.181582 | 1.000000 | -0.133171 | 0.079331 | -0.082125 | -0.003088 | -0.006450 | -0.143499 | ... | 0.295032 | 0.385228 | 0.309068 | 0.309068 | -0.116850 | 0.309068 | 0.019452 | 0.157654 | 0.316701 | 0.583206 |
| CALML6 | -0.093365 | -0.059383 | -0.062692 | -0.133171 | 1.000000 | -0.062831 | -0.062735 | -0.059846 | -0.097162 | -0.043478 | ... | -0.077984 | 0.248300 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.091237 | -0.159078 | -0.117442 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TMEM255A | -0.034003 | -0.059383 | -0.062692 | 0.309068 | -0.043478 | -0.062831 | -0.062735 | -0.059846 | -0.097162 | -0.043478 | ... | -0.077984 | 0.230799 | 1.000000 | 1.000000 | -0.043478 | 1.000000 | -0.043478 | -0.091237 | 0.256518 | 0.562305 |
| SMIM10L2B-AS1 | -0.036643 | -0.059383 | -0.062692 | 0.019452 | -0.043478 | -0.062831 | 0.737278 | -0.059846 | -0.097162 | -0.043478 | ... | -0.077984 | -0.094460 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | 0.523393 | 0.037215 | 0.130174 |
| IL9R_ENSG00000124334 | 0.187654 | -0.124612 | -0.131555 | 0.157654 | -0.091237 | 0.269090 | 0.339578 | -0.125584 | -0.073338 | -0.091237 | ... | -0.163645 | -0.006889 | -0.091237 | -0.091237 | -0.091237 | -0.091237 | 0.523393 | 1.000000 | 0.050801 | 0.197187 |
| DDX3Y | 0.602785 | 0.173334 | -0.209975 | 0.316701 | -0.159078 | -0.210786 | 0.200461 | -0.228903 | 0.451454 | 0.254247 | ... | -0.301277 | 0.402058 | 0.256518 | 0.256518 | -0.127493 | 0.256518 | 0.037215 | 0.050801 | 1.000000 | 0.819119 |
| EIF1AY | 0.673038 | -0.071186 | -0.179643 | 0.583206 | -0.117442 | -0.134538 | 0.088271 | -0.166598 | 0.225645 | -0.017051 | ... | -0.219688 | 0.382712 | 0.562305 | 0.562305 | -0.122181 | 0.562305 | 0.130174 | 0.197187 | 0.819119 | 1.000000 |
1000 rows × 1000 columns
Step 2: Calculate Distance Matrix¶
Now that we have the correlation matrix, we need to calculate the distance matrix. A distance matrix is a mathematical representation that captures the pairwise distances between a set of objects. In hierarchical clustering, distance matrices are used to determine which objects to merge or split based on their pairwise distances. So, we will be using the distance matrix in order to calculate clusters between genes, which will form our networks. These are commonly also called communities.
# Perform hierarchical clustering using the dissimilarity values
distance_matrix = 1 - correlation_matrix # Convert correlation to distance
#The linkage function is used to perform hierarchical clustering on the distance matrix.
linkage_matrix = sch.linkage(distance_matrix, method='average')
# Plot the dendrogram
plt.figure(figsize=(10, 6))
dendrogram = sch.dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.show()
Step 3: Threshold Correlations¶
Due to all of the vast correlations, the networks will be very messy each having lots of edges with varying correlation strengths. Not all of these are of interest, such as weak correlations. So we would solely like to focus on the strong correlations.
# Define threshold for significant edges (you can adjust this based on your requirement)
threshold = 1
# Threshold the correlation matrix to determine significant edges
significant_edges = correlation_matrix > threshold
Lets have a look at the significant_edges dataframe. You can now see that this produces a boolean dataframe which states whether the values are greater than the threshold or not.
significant_edges
| ISG15 | LINC01342 | TTLL10-AS1 | TNFRSF18 | CALML6 | CHD5 | ICMT-DT | MIR34AHG | RBP7 | MTOR-AS1 | ... | FRMPD3 | TSC22D3 | KLHL13 | AKAP14 | RHOXF1-AS1 | TMEM255A | SMIM10L2B-AS1 | IL9R_ENSG00000124334 | DDX3Y | EIF1AY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ISG15 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| LINC01342 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| TTLL10-AS1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| TNFRSF18 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| CALML6 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TMEM255A | False | False | False | False | False | False | False | False | False | False | ... | False | False | True | False | False | False | False | False | False | False |
| SMIM10L2B-AS1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| IL9R_ENSG00000124334 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| DDX3Y | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| EIF1AY | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
1000 rows × 1000 columns
Step 4: Construct Network¶
networkx is a python library designed for network analysis. Construct the network from significant edges.
G = nx.Graph()
# Loop through significant edges and add edges to the graph
for i in range(significant_edges.shape[0]):
for j in range(significant_edges.shape[1]):
if significant_edges.values[i, j]:
gene1 = significant_edges.index[i]
gene2 = significant_edges.columns[j]
# Find the position of gene1 and gene2 in the correlation matrix
i_corr = np.where(correlation_matrix.index == gene1)[0][0]
j_corr = np.where(correlation_matrix.columns == gene2)[0][0]
# Add the edge to the graph
G.add_edge(gene1, gene2, weight=correlation_matrix.iloc[i_corr, j_corr])
# Display the number of nodes and edges in the graph
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())
Number of nodes: 270 Number of edges: 1291
Step 5: Network Analysis¶
Analyse the network: For example, you can identify clusters/modules using community detection algorithms
communities = nx.algorithms.community.greedy_modularity_communities(G)
communities
[frozenset({'AKAP14',
'ANKRD29',
'ANO1',
'ARHGAP23',
'ARHGEF39',
'C21orf62',
'CC2D2B',
'CCNA1',
'CDH6',
'CFAP99',
'CNKSR3',
'CYYR1',
'DBX2',
'DENND6A-AS1',
'DEPDC7',
'DTNA',
'FNDC5',
'GJB6',
'GJC2',
'GNG12',
'HCG9',
'HYDIN',
'IGLC4',
'IGLV5-37',
'KLHL13',
'KRT2',
'LAMA2',
'LCN2',
'LIM2',
'LIN28A',
'LINC00640',
'LINC00997',
'LINC01133',
'LINC01832',
'LINC01891',
'LINC02267',
'LNCOC1',
'MGAM',
'MOBP',
'MUCL3',
'NGFR',
'NKD2',
'NKX6-3',
'OPA1-AS1',
'OR2B11',
'PHACTR3',
'PLSCR2',
'PRLR',
'PRRT3-AS1',
'PTGES',
'RARRES2',
'REG1A',
'RNF182',
'SLC10A5',
'SLC17A7',
'SLC26A8',
'SLC28A2',
'SLC2A4',
'SLC6A3',
'SRRM5',
'STAP2',
'STARD6',
'TDRD1',
'TDRP',
'TECTA',
'TMEM255A',
'TMEM72-AS1',
'ZC2HC1B',
'ZNF462',
'ZSCAN10'}),
frozenset({'ACTN3',
'ACVR2B-AS1',
'C10orf105',
'CALML6',
'CDC42EP1',
'DLGAP2',
'DPYD-IT1',
'EPHB2',
'ESCO2',
'FAM174A-DT',
'FOXD3',
'H3C8',
'ICA1-AS1',
'LINC01986',
'LINC02348',
'LINC02569',
'LINC02615',
'MROH8',
'PAPPA-AS1',
'PDK4-AS1',
'PRKD3-DT',
'RAB6C',
'RGPD6',
'SLC12A3',
'TFAP2A',
'UBE2Q2P16',
'WNT3A',
'ZNF503'}),
frozenset({'BSN',
'C12orf71',
'CADM4',
'CCK',
'CLGN',
'COPDA1',
'DAGLA',
'DBH-AS1',
'FAM20C',
'IGHV3-32',
'IGKV3D-11',
'KCNQ4',
'LGALSL-DT',
'LINC01050',
'LINC01503',
'NKX6-2',
'PGAM2',
'PPP1R9A-AS1',
'PRR15',
'REEP1',
'RGL3',
'SMIM17',
'SPDYE21',
'STON2',
'TWIST2',
'ZNF32-AS2',
'ZSCAN2-AS1'}),
frozenset({'ALDH8A1',
'CUX2',
'EPHA2',
'GLP2R',
'IGHV3-69-1',
'IGHV3-73',
'MAG',
'MEX3B',
'MOCS1',
'MTOR-AS1',
'MYO3B',
'OR2AK2',
'PCSK1',
'PITPNM2-AS1',
'PTPRD-AS1',
'SCT',
'SLC5A5',
'ST8SIA5',
'TAF1L',
'TKTL2',
'ZNF474'}),
frozenset({'ADGRG3',
'BRME1',
'CFAP141',
'CLEC1B',
'COL11A2',
'DPYSL4',
'FAM186B',
'FSTL1',
'LDHAL6A',
'LINC00200',
'LINC01108',
'LINC02880',
'PRUNE2',
'RARRES1',
'SCARF2',
'SMIM10L2B-AS1',
'TRBJ2-4',
'UBL4B',
'VWA5B1',
'ZNF491'}),
frozenset({'ACTL7B',
'AMPD1',
'ATP8A1-DT',
'DEPTOR-AS1',
'EDNRB-AS1',
'H3C12',
'HEATR4',
'IGKV1D-13',
'IGKV2D-30',
'LINC01823',
'LINC01990',
'MIPOL1',
'MIR130AHG',
'OR6C6',
'SLC16A14',
'SLC1A2-AS1',
'SLC49A3',
'SLC4A9',
'TGFB2-AS1'}),
frozenset({'C1orf50-AS1',
'FOXI1',
'GYS2',
'HCG22',
'HDHD5-AS1',
'IGHV1-14',
'LINC01579',
'LINC02055',
'LINC03021_ENSG00000254319',
'MYLK3',
'NAALADL2',
'NLRP6',
'PCDHGA3',
'PRSS45P',
'RNF217-AS1',
'UNC79',
'VSTM1',
'VWA7'}),
frozenset({'CASC9',
'CCDC28A-AS1',
'CFAP57',
'CYP2S1',
'FREY1',
'GCNT3',
'IGHV7-4-1',
'IGLV5-45',
'LINC00654',
'NHS',
'RHOXF1-AS1',
'RNF112',
'SLITRK5',
'THTPA',
'TMPRSS3',
'TRMT9B',
'TRPM1'}),
frozenset({'ASIC3',
'BHLHE22',
'C10orf71',
'CT69',
'DNMBP-AS1',
'EDAR',
'LIF',
'MS4A4E',
'S100A16',
'SLC22A1',
'TRAV29DV5',
'TTLL7'}),
frozenset({'ACTA2-AS1',
'CDH23',
'CHRNG',
'FAM182A',
'LINC02660',
'NKX6-1',
'PACRG',
'TAS2R39'}),
frozenset({'ASGR2', 'GOLGA8H', 'IGKV2-26', 'LINC02057', 'LRRC49', 'TEKTIP1'}),
frozenset({'CBY3', 'CYP2C8', 'LINC01301', 'PELP1-DT', 'PTGR1'}),
frozenset({'DLG1-AS1', 'IQSEC2', 'KCNH6', 'LINC01980', 'LPAR4'}),
frozenset({'DUOX2', 'HSD17B2-AS1', 'LINC01115_ENSG00000237667', 'MYO3A'}),
frozenset({'FCN3', 'PCDHB4', 'S100A5'}),
frozenset({'CNTNAP3', 'IGFBP2', 'USP43'}),
frozenset({'NPM2', 'RHBDF1'}),
frozenset({'SERTAD3-AS1', 'STXBP6'})]
# Initialise a new graph for community visualisation
community_graph = nx.Graph()
# Add edges between communities based on their connectivity
for i, community in enumerate(communities):
for node in community:
community_graph.add_node(node)
for neighbor in G.neighbors(node):
if neighbor in community:
community_graph.add_edge(node, neighbor, weight=G[node][neighbor]['weight'])
# Initialise an empty list to store separated communities
separated_communities = []
# Iterate over the detected communities
for community in communities:
# Convert the set of nodes into a list and append it to the list of lists
separated_communities.append(list(community))
# Print or use the separated communities as needed
print(separated_communities)
[['NKX6-3', 'ZC2HC1B', 'ZSCAN10', 'HCG9', 'CC2D2B', 'DBX2', 'MOBP', 'STAP2', 'LCN2', 'ZNF462', 'TMEM72-AS1', 'LNCOC1', 'SLC28A2', 'SLC10A5', 'SRRM5', 'SLC17A7', 'OPA1-AS1', 'DEPDC7', 'LIN28A', 'SLC6A3', 'GJB6', 'TMEM255A', 'IGLC4', 'LINC00640', 'DENND6A-AS1', 'FNDC5', 'LAMA2', 'CYYR1', 'CNKSR3', 'RNF182', 'TECTA', 'KRT2', 'PRRT3-AS1', 'LINC02267', 'MUCL3', 'CFAP99', 'CCNA1', 'SLC26A8', 'GNG12', 'PHACTR3', 'LINC01133', 'C21orf62', 'ANKRD29', 'NGFR', 'NKD2', 'GJC2', 'LIM2', 'ANO1', 'LINC00997', 'STARD6', 'ARHGAP23', 'HYDIN', 'KLHL13', 'IGLV5-37', 'PLSCR2', 'TDRP', 'REG1A', 'LINC01832', 'ARHGEF39', 'CDH6', 'LINC01891', 'MGAM', 'AKAP14', 'TDRD1', 'PRLR', 'RARRES2', 'SLC2A4', 'OR2B11', 'DTNA', 'PTGES'], ['ACTN3', 'PDK4-AS1', 'C10orf105', 'ACVR2B-AS1', 'LINC02348', 'DPYD-IT1', 'UBE2Q2P16', 'PRKD3-DT', 'LINC01986', 'CALML6', 'EPHB2', 'PAPPA-AS1', 'DLGAP2', 'MROH8', 'RAB6C', 'ZNF503', 'H3C8', 'LINC02569', 'CDC42EP1', 'TFAP2A', 'ESCO2', 'WNT3A', 'SLC12A3', 'RGPD6', 'ICA1-AS1', 'FOXD3', 'LINC02615', 'FAM174A-DT'], ['IGHV3-32', 'LINC01050', 'STON2', 'CLGN', 'CCK', 'RGL3', 'COPDA1', 'PPP1R9A-AS1', 'SPDYE21', 'BSN', 'PRR15', 'IGKV3D-11', 'C12orf71', 'SMIM17', 'FAM20C', 'KCNQ4', 'PGAM2', 'DAGLA', 'NKX6-2', 'REEP1', 'LGALSL-DT', 'ZNF32-AS2', 'ZSCAN2-AS1', 'TWIST2', 'LINC01503', 'CADM4', 'DBH-AS1'], ['MAG', 'MOCS1', 'ALDH8A1', 'MTOR-AS1', 'MEX3B', 'MYO3B', 'GLP2R', 'SCT', 'TAF1L', 'IGHV3-73', 'OR2AK2', 'PTPRD-AS1', 'ST8SIA5', 'PITPNM2-AS1', 'EPHA2', 'TKTL2', 'SLC5A5', 'PCSK1', 'CUX2', 'IGHV3-69-1', 'ZNF474'], ['LINC02880', 'COL11A2', 'SCARF2', 'ADGRG3', 'ZNF491', 'CFAP141', 'FSTL1', 'TRBJ2-4', 'SMIM10L2B-AS1', 'UBL4B', 'CLEC1B', 'DPYSL4', 'LDHAL6A', 'PRUNE2', 'LINC00200', 'BRME1', 'FAM186B', 'VWA5B1', 'RARRES1', 'LINC01108'], ['SLC1A2-AS1', 'MIR130AHG', 'HEATR4', 'SLC49A3', 'OR6C6', 'SLC4A9', 'IGKV1D-13', 'ACTL7B', 'LINC01823', 'MIPOL1', 'AMPD1', 'IGKV2D-30', 'SLC16A14', 'DEPTOR-AS1', 'EDNRB-AS1', 'H3C12', 'LINC01990', 'ATP8A1-DT', 'TGFB2-AS1'], ['NLRP6', 'PRSS45P', 'NAALADL2', 'VSTM1', 'MYLK3', 'FOXI1', 'PCDHGA3', 'VWA7', 'HDHD5-AS1', 'LINC03021_ENSG00000254319', 'LINC01579', 'IGHV1-14', 'HCG22', 'LINC02055', 'UNC79', 'RNF217-AS1', 'C1orf50-AS1', 'GYS2'], ['CASC9', 'CCDC28A-AS1', 'TRPM1', 'CYP2S1', 'TMPRSS3', 'THTPA', 'GCNT3', 'FREY1', 'RHOXF1-AS1', 'NHS', 'IGHV7-4-1', 'SLITRK5', 'IGLV5-45', 'TRMT9B', 'CFAP57', 'RNF112', 'LINC00654'], ['ASIC3', 'SLC22A1', 'BHLHE22', 'LIF', 'S100A16', 'CT69', 'TRAV29DV5', 'C10orf71', 'TTLL7', 'MS4A4E', 'DNMBP-AS1', 'EDAR'], ['FAM182A', 'PACRG', 'NKX6-1', 'ACTA2-AS1', 'TAS2R39', 'CHRNG', 'LINC02660', 'CDH23'], ['TEKTIP1', 'ASGR2', 'IGKV2-26', 'LINC02057', 'GOLGA8H', 'LRRC49'], ['CYP2C8', 'PELP1-DT', 'LINC01301', 'CBY3', 'PTGR1'], ['KCNH6', 'LINC01980', 'DLG1-AS1', 'IQSEC2', 'LPAR4'], ['MYO3A', 'HSD17B2-AS1', 'DUOX2', 'LINC01115_ENSG00000237667'], ['FCN3', 'PCDHB4', 'S100A5'], ['USP43', 'IGFBP2', 'CNTNAP3'], ['NPM2', 'RHBDF1'], ['STXBP6', 'SERTAD3-AS1']]
len(communities)
18
import pickle
# Save the separated_communities list as a pkl file
file_path = '/ReCoDE-Gene-Network-Analysis/data/other/separated_communities.pkl'
with open(file_path, 'wb') as file:
pickle.dump(separated_communities, file)
Step 6: Investigate Chosen Community/Sub-Network¶
# Choose the index of the community you want to visualise
community_index = 0 # Change this to the index of the community you want to visualise
# Get the nodes in the selected community
selected_community = list(communities[community_index])
# Create a subgraph containing only the nodes and edges within the selected community
subgraph = G.subgraph(selected_community)
# Assuming subgraph is already defined
num_nodes = subgraph.number_of_nodes()
print(f"The number of nodes in the subgraph is: {num_nodes}")
The number of nodes in the subgraph is: 70
Step 7: Visualise the Community/Sub-Network¶
# Visualise the subgraph
pos = nx.spring_layout(subgraph, k=0.8, iterations=20) # You can use different layout algorithms if needed
plt.figure(figsize=(14, 12))
nx.draw(subgraph, pos, with_labels=True, node_color='skyblue', node_size=100, edge_color='gray', linewidths=0.1, font_size=8)
plt.title('Community Visualisation')
plt.show()
Play around with different layouts and see how the visualisation is affected. For example, there is circular_layout. Check out https://networkx.org/documentation/stable/tutorial.html#drawing-graphs for more information.
# Visualise the subgraph
pos = nx.circular_layout(subgraph) # You can use different layout algorithms if needed
plt.figure(figsize=(14, 12))
nx.draw(subgraph, pos, with_labels=True, node_color='skyblue', node_size=100, edge_color='gray', linewidths=0.1, font_size=8)
plt.title('Community Visualisation')
plt.show()
As can be seen here, the circular layout is not really suitable due to the high number of genes within the sub-network that have high density of edges.
Since the subnetwork itself is too large to visualise, we shall visualise a random sample of the subnetwork.
# Select 10 random nodes from the original subgraph
selected_nodes = random.sample(subgraph.nodes(), 10)
# Create a new subgraph containing only the selected nodes and their neighbors
reduced_subgraph = subgraph.subgraph(selected_nodes)
# Visualise the subgraph
pos = nx.circular_layout(reduced_subgraph) # You can use different layout algorithms if needed
plt.figure(figsize=(14, 12))
nx.draw(reduced_subgraph, pos, with_labels=True, node_color='skyblue', node_size=100, edge_color='gray', linewidths=0.1, font_size=8)
plt.title('Community Visualisation')
plt.show()
Try out a different visualisation technique with the randomly sampled subnetwork
# Visualise the subgraph
pos = nx.spectral_layout(reduced_subgraph) # You can use different layout algorithms if needed
plt.figure(figsize=(14, 12))
nx.draw(reduced_subgraph, pos, with_labels=True, node_color='skyblue', node_size=1000, edge_color='gray', linewidths=0.1, font_size=10)
plt.title('Community Visualisation')
plt.show()
External Reading Links:
- WGCNA package: https://web.archive.org/web/20230323144343/horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/
- WGCNA paper: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-559
- Correlation analysis: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5079093/
- Quick introduction page to correlations in statistics: https://www.jmp.com/en_gb/statistics-knowledge-portal/what-is-correlation.html
Exercise Questions:
- What is a gene co-expression network, and why is it important in understanding biological systems?
- Explain the concept of Pearson correlation and its use in gene expression analysis.
- What other correlation measures could be used?
- Why do we convert a correlation matrix to a distance matrix in hierarchical clustering?
- What other distance measures could be used?
- What is hierarchical clustering, and how is it applied in the context of gene co-expression networks?
- Are there other community-based algorithms that could be used instead?
- Show a worked through example testing different correlation measures, distance measures and community algorithms and see how this affects the networks produced.
Answers:
- A gene co-expression network is a type of biological network where nodes represent genes and edges represent significant co-expression relationships between them. These networks are important because they help identify groups of genes (modules) that are co-expressed under various conditions, which can reveal functional relationships and regulatory mechanisms within the biological system. Understanding these networks can lead to insights into gene functions, interactions, and their roles in diseases.
- Pearson correlation is a measure of the linear relationship between two variables, providing a value between -1 and 1. In gene expression analysis, Pearson correlation is used to quantify the degree to which the expression levels of two genes vary together across different samples or conditions. A value close to 1 indicates strong positive correlation, -1 indicates strong negative correlation, and 0 indicates no linear correlation. This measure helps identify gene pairs that might be co-regulated or involved in the same biological processes.
- Spearman Rank Correlation measures the strength and direction of the monotonic relationship between two ranked variables. It is useful for ordinal data or when the relationship isn't linear. Kendall Tau Correlation measures the association between two ranked variables. It is suitable for small sample sizes or data with many tied ranks.
- We convert a correlation matrix to a distance matrix because hierarchical clustering algorithms require a measure of dissimilarity between pairs of genes. The distance matrix represents this dissimilarity, with smaller values indicating more similar (correlated) gene expression profiles. This conversion allows the clustering algorithm to group similar genes together based on their expression patterns, facilitating the identification of co-expressed gene modules.
- Euclidean Distance is the straight-line distance between two points in Euclidean space. Manhattan Distance is the sum of absolute differences between the coordinates of two points. Minkowski Distance is a generalisation that includes both Euclidean and Manhattan distances. By providing a comprehensive way to quantify relationships between objects, distance matrices play a fundamental role in data analysis, pattern recognition, and various scientific research fields.
- Hierarchical clustering is a method of cluster analysis which seeks to build a hierarchy of clusters. It is applied in gene co-expression networks to group genes with similar expression patterns. The process starts with each gene as its own cluster and iteratively merges the most similar clusters based on their distance, forming a tree-like structure called a dendrogram. This helps visualise the relationships between genes and identify clusters of co-expressed genes.
- The Louvain method is an efficient algorithm for community detection that optimises modularity. The Girvan-Newman algorithm detects communities by progressively removing edges with the highest betweenness centrality.
Question 8:
#Calculate correlation using Spearman Rank Correlation:
correlation_matrix2 = datExpr.corr(method = 'spearman')
correlation_matrix2
| ISG15 | LINC01342 | TTLL10-AS1 | TNFRSF18 | CALML6 | CHD5 | ICMT-DT | MIR34AHG | RBP7 | MTOR-AS1 | ... | FRMPD3 | TSC22D3 | KLHL13 | AKAP14 | RHOXF1-AS1 | TMEM255A | SMIM10L2B-AS1 | IL9R_ENSG00000124334 | DDX3Y | EIF1AY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ISG15 | 1.000000 | 0.016319 | -0.056208 | 0.686548 | -0.165674 | -0.037170 | 0.049862 | -0.069807 | 0.222848 | -0.075307 | ... | 0.369104 | 0.486957 | 0.165674 | 0.165674 | 0.105429 | 0.165674 | 0.135552 | 0.266461 | 0.116547 | 0.203655 |
| LINC01342 | 0.016319 | 1.000000 | 0.452741 | -0.237069 | -0.062810 | -0.090737 | 0.452741 | -0.090737 | 0.261696 | 0.722315 | ... | 0.321733 | 0.320931 | -0.062810 | -0.062810 | 0.659505 | -0.062810 | -0.062810 | -0.134016 | 0.140551 | -0.009074 |
| TTLL10-AS1 | -0.056208 | 0.452741 | 1.000000 | -0.149824 | -0.062810 | -0.090737 | -0.090737 | -0.090737 | -0.153188 | -0.062810 | ... | 0.359584 | 0.269256 | -0.062810 | -0.062810 | 0.722315 | -0.062810 | -0.062810 | -0.134016 | -0.192238 | -0.389263 |
| TNFRSF18 | 0.686548 | -0.237069 | -0.149824 | 1.000000 | -0.136595 | 0.265847 | -0.063493 | -0.040654 | -0.004319 | -0.273190 | ... | 0.294203 | 0.108656 | 0.288367 | 0.288367 | -0.045532 | 0.288367 | 0.166950 | 0.193963 | 0.016433 | 0.368349 |
| CALML6 | -0.165674 | -0.062810 | -0.062810 | -0.136595 | 1.000000 | -0.062810 | -0.062810 | -0.062810 | -0.106040 | -0.043478 | ... | -0.078603 | 0.195797 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.092769 | -0.165710 | -0.165819 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TMEM255A | 0.165674 | -0.062810 | -0.062810 | 0.288367 | -0.043478 | -0.062810 | -0.062810 | -0.062810 | -0.106040 | -0.043478 | ... | -0.078603 | 0.165674 | 1.000000 | 1.000000 | -0.043478 | 1.000000 | -0.043478 | -0.092769 | 0.286227 | 0.316563 |
| SMIM10L2B-AS1 | 0.135552 | -0.062810 | -0.062810 | 0.166950 | -0.043478 | -0.062810 | 0.722315 | -0.062810 | -0.106040 | -0.043478 | ... | -0.078603 | 0.015061 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | 0.487035 | 0.135581 | 0.256265 |
| IL9R_ENSG00000124334 | 0.266461 | -0.134016 | -0.134016 | 0.193963 | -0.092769 | 0.267335 | 0.302235 | -0.134016 | -0.008485 | -0.092769 | ... | -0.167714 | 0.181434 | -0.092769 | -0.092769 | -0.092769 | -0.092769 | 0.487035 | 1.000000 | -0.108148 | 0.060307 |
| DDX3Y | 0.116547 | 0.140551 | -0.192238 | 0.016433 | -0.165710 | -0.246645 | 0.279289 | -0.265687 | 0.304952 | 0.256098 | ... | -0.395663 | 0.203523 | 0.286227 | 0.286227 | -0.075323 | 0.286227 | 0.135581 | -0.108148 | 1.000000 | 0.829598 |
| EIF1AY | 0.203655 | -0.009074 | -0.389263 | 0.368349 | -0.165819 | -0.068960 | 0.328469 | -0.286730 | 0.251229 | 0.195968 | ... | -0.411819 | 0.026110 | 0.316563 | 0.316563 | -0.226116 | 0.316563 | 0.256265 | 0.060307 | 0.829598 | 1.000000 |
1000 rows × 1000 columns
Calculate distance matrix using Minkowski Distance
Convert correlation matrix to distance matrix Here, we use 1 - correlation as the distance measure by first calculating the dissimilarity
distance_matrix2 = 1 - correlation_matrix2
# Calculate the Minkowski distance (p = 2 for Euclidean, p = 1 for Manhattan, etc.)
p = 3 # You can change this to any value of p
minkowski_distances = pdist(distance_matrix2, metric='minkowski', p=p)
# Convert back to square form
distance_matrix_minkowski = squareform(minkowski_distances)
distance_matrix_minkowski
array([[0. , 4.97451632, 5.31047871, ..., 4.11670602, 4.41095818,
4.07169558],
[4.97451632, 0. , 2.81633536, ..., 3.94219913, 3.39496526,
4.15888346],
[5.31047871, 2.81633536, 0. , ..., 3.8634609 , 3.85574122,
4.57033685],
...,
[4.11670602, 3.94219913, 3.8634609 , ..., 0. , 3.54310329,
3.11503694],
[4.41095818, 3.39496526, 3.85574122, ..., 3.54310329, 0. ,
1.79073525],
[4.07169558, 4.15888346, 4.57033685, ..., 3.11503694, 1.79073525,
0. ]])
# Convert back to DataFrame to keep row and column names
#In order to do that save the row and column names from the original correlation matrix first:
row_labels = correlation_matrix.index
column_labels = correlation_matrix.columns
#Convert the distance matrix into a dataframe
distance_df = pd.DataFrame(distance_matrix_minkowski, index=row_labels, columns=column_labels)
distance_df
| ISG15 | LINC01342 | TTLL10-AS1 | TNFRSF18 | CALML6 | CHD5 | ICMT-DT | MIR34AHG | RBP7 | MTOR-AS1 | ... | FRMPD3 | TSC22D3 | KLHL13 | AKAP14 | RHOXF1-AS1 | TMEM255A | SMIM10L2B-AS1 | IL9R_ENSG00000124334 | DDX3Y | EIF1AY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ISG15 | 0.000000 | 4.974516 | 5.310479 | 2.479338 | 5.256966 | 5.137140 | 4.833509 | 6.154665 | 4.101156 | 5.488098 | ... | 3.468319 | 3.099291 | 4.518263 | 4.518263 | 4.631243 | 4.518263 | 4.700795 | 4.116706 | 4.410958 | 4.071696 |
| LINC01342 | 4.974516 | 0.000000 | 2.816335 | 5.643094 | 4.218496 | 4.195265 | 3.144677 | 4.192129 | 3.099887 | 2.302618 | ... | 3.323716 | 4.142144 | 5.375076 | 5.375076 | 2.521129 | 5.375076 | 4.265176 | 3.942199 | 3.394965 | 4.158883 |
| TTLL10-AS1 | 5.310479 | 2.816335 | 0.000000 | 5.641991 | 4.085150 | 3.976405 | 3.911277 | 3.799220 | 3.834896 | 3.974338 | ... | 3.049130 | 4.647630 | 5.344209 | 5.344209 | 1.850177 | 5.344209 | 4.147613 | 3.863461 | 3.855741 | 4.570337 |
| TNFRSF18 | 2.479338 | 5.643094 | 5.641991 | 0.000000 | 5.508208 | 4.336978 | 4.984989 | 5.608852 | 4.892775 | 5.900174 | ... | 3.693057 | 4.861216 | 4.060531 | 4.060531 | 5.197886 | 4.060531 | 4.323955 | 3.887995 | 4.615055 | 3.413775 |
| CALML6 | 5.256966 | 4.218496 | 4.085150 | 5.508208 | 0.000000 | 4.506604 | 4.367332 | 4.538077 | 4.231430 | 4.442941 | ... | 4.214099 | 4.341695 | 5.488562 | 5.488562 | 4.288198 | 5.488562 | 4.537409 | 4.287737 | 4.430332 | 4.692939 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TMEM255A | 4.518263 | 5.375076 | 5.344209 | 4.060531 | 5.488562 | 5.278605 | 5.328074 | 5.496537 | 5.339846 | 5.455101 | ... | 5.181272 | 4.935519 | 0.000000 | 0.000000 | 5.318906 | 0.000000 | 5.310234 | 5.211184 | 3.732318 | 3.549149 |
| SMIM10L2B-AS1 | 4.700795 | 4.265176 | 4.147613 | 4.323955 | 4.537409 | 3.970086 | 2.314571 | 3.959092 | 4.193764 | 4.310500 | ... | 4.015744 | 5.423327 | 5.310234 | 5.310234 | 4.340555 | 5.310234 | 0.000000 | 2.426073 | 3.434198 | 3.095993 |
| IL9R_ENSG00000124334 | 4.116706 | 3.942199 | 3.863461 | 3.887995 | 4.287737 | 2.609322 | 2.953918 | 3.696751 | 3.531675 | 3.992939 | ... | 3.640487 | 4.822124 | 5.211184 | 5.211184 | 4.059895 | 5.211184 | 2.426073 | 0.000000 | 3.543103 | 3.115037 |
| DDX3Y | 4.410958 | 3.394965 | 3.855741 | 4.615055 | 4.430332 | 4.203337 | 2.995581 | 3.928682 | 2.754223 | 3.155120 | ... | 4.137625 | 4.486492 | 3.732318 | 3.732318 | 4.001664 | 3.732318 | 3.434198 | 3.543103 | 0.000000 | 1.790735 |
| EIF1AY | 4.071696 | 4.158883 | 4.570337 | 3.413775 | 4.692939 | 3.711069 | 3.040494 | 3.978252 | 3.405516 | 3.697280 | ... | 4.162647 | 5.164845 | 3.549149 | 3.549149 | 4.624298 | 3.549149 | 3.095993 | 3.115037 | 1.790735 | 0.000000 |
1000 rows × 1000 columns
#The linkage function is used to perform hierarchical clustering on the distance matrix.
linkage_matrix2 = sch.linkage(distance_df, method='average')
# Plot the dendrogram
plt.figure(figsize=(10, 6))
dendrogram = sch.dendrogram(linkage_matrix2)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.show()
/tmp/ipykernel_128889/2932722225.py:2: ClusterWarning: scipy.cluster: The symmetric non-negative hollow observation matrix looks suspiciously like an uncondensed distance matrix linkage_matrix2 = sch.linkage(distance_df, method='average')
# Define threshold for significant edges (you can adjust this based on your requirement)
threshold = 0.7
# Threshold the correlation matrix to determine significant edges
significant_edges2 = correlation_matrix2 > threshold
significant_edges2
| ISG15 | LINC01342 | TTLL10-AS1 | TNFRSF18 | CALML6 | CHD5 | ICMT-DT | MIR34AHG | RBP7 | MTOR-AS1 | ... | FRMPD3 | TSC22D3 | KLHL13 | AKAP14 | RHOXF1-AS1 | TMEM255A | SMIM10L2B-AS1 | IL9R_ENSG00000124334 | DDX3Y | EIF1AY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ISG15 | True | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| LINC01342 | False | True | False | False | False | False | False | False | False | True | ... | False | False | False | False | False | False | False | False | False | False |
| TTLL10-AS1 | False | False | True | False | False | False | False | False | False | False | ... | False | False | False | False | True | False | False | False | False | False |
| TNFRSF18 | False | False | False | True | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| CALML6 | False | False | False | False | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TMEM255A | False | False | False | False | False | False | False | False | False | False | ... | False | False | True | True | False | True | False | False | False | False |
| SMIM10L2B-AS1 | False | False | False | False | False | False | True | False | False | False | ... | False | False | False | False | False | False | True | False | False | False |
| IL9R_ENSG00000124334 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | True | False | False |
| DDX3Y | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | True | True |
| EIF1AY | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | True | True |
1000 rows × 1000 columns
#networkx is a python library designed for network analysis.
# Construct the network from significant edges.
G2 = nx.Graph()
# Loop through significant edges and add edges to the graph
for i in range(significant_edges2.shape[0]):
for j in range(significant_edges2.shape[1]):
if significant_edges2.values[i, j]:
gene1 = significant_edges2.index[i]
gene2 = significant_edges2.columns[j]
# Find the position of gene1 and gene2 in the correlation matrix
i_corr = np.where(correlation_matrix2.index == gene1)[0][0]
j_corr = np.where(correlation_matrix2.columns == gene2)[0][0]
# Add the edge to the graph
G2.add_edge(gene1, gene2, weight=correlation_matrix2.iloc[i_corr, j_corr])
# Display the number of nodes and edges in the graph
print("Number of nodes:", G2.number_of_nodes())
print("Number of edges:", G2.number_of_edges())
Number of nodes: 1000 Number of edges: 30389
Analyse the network: For example, you can identify clusters/modules using community detection algorithms Connected_components is a quick and simple community detection algorithm. While this method may not be as sophisticated as algorithms like Louvain or Girvan-Newman, it can provide a basic partitioning of the graph into communities.
communities2 = list(nx.connected_components(G2))
communities2
[{'ABALON',
'ACP5',
'ACSL3-AS1',
'ACSM3',
'ACTA2-AS1',
'ACTG1',
'ACTN3',
'ACVR2B-AS1',
'ADAM19',
'ADAMTS13',
'ADARB2',
'ADGRG3',
'AFF3',
'AHSP',
'AIFM3',
'AIM2',
'AKAP13-AS1',
'AKAP14',
'ALDH1A1',
'ALDH1L2',
'ALDH8A1',
'ALKAL2',
'AMOTL1',
'ANKRD28',
'ANKRD29',
'ANKRD44-DT',
'ANKRD61',
'ANO1',
'ANO8',
'ANXA1',
'APOE',
'AREG',
'ARHGAP15',
'ARHGAP17',
'ARHGAP22',
'ARHGAP23',
'ARHGAP29',
'ARHGAP39',
'ARHGEF10L',
'ARHGEF39',
'ARID5B',
'ARL4A',
'ARL4C',
'ARPP21',
'ASGR1',
'ASGR2',
'ASIC3',
'ASPN',
'ATF7IP2',
'ATP1A3',
'ATP1B3',
'ATP2B1-AS1',
'AVP',
'AZU1',
'BANK1',
'BASP1',
'BAZ2B',
'BCAR4',
'BCL11B',
'BCL2A1',
'BCL7A',
'BEX1',
'BHLHE22',
'BHLHE40',
'BMP3',
'BORCS5',
'BRME1',
'BSN',
'BTG1-DT',
'C10orf105',
'C10orf71',
'C12orf71',
'C15orf48',
'C16orf74',
'C18orf63',
'C19orf67',
'C1QTNF7',
'C1orf131',
'C1orf162',
'C1orf50-AS1',
'C1orf56',
'C20orf144',
'C21orf62',
'C7orf50',
'CA1',
'CACTIN-AS1',
'CADM4',
'CALCA',
'CALHM6',
'CALML6',
'CAMK2D',
'CAPN14',
'CASC9',
'CAVIN2',
'CAVIN3',
'CC2D2B',
'CCDC110',
'CCDC28A-AS1',
'CCK',
'CCL22',
'CCL3',
'CCL3L3',
'CCL4',
'CCL4L2',
'CCL5',
'CCL7',
'CCN1',
'CCNA1',
'CCND2',
'CCR7',
'CD109',
'CD19',
'CD2',
'CD22',
'CD24',
'CD2AP-DT',
'CD3D',
'CD3G',
'CD52',
'CD69',
'CD70',
'CD72',
'CD79B',
'CD83',
'CDC42EP1',
'CDCA7L',
'CDH13-AS2',
'CDH23',
'CDH24',
'CDH6',
'CEACAM1',
'CELSR2',
'CELSR3',
'CEP290',
'CEP295NL',
'CERCAM',
'CFAP126',
'CFAP141',
'CFAP157',
'CFAP418-AS1',
'CFAP57',
'CFAP99',
'CHD5',
'CHML',
'CHMP1B',
'CHMP1B-AS1',
'CHRNG',
'CITED2',
'CKS2',
'CLDN11',
'CLEC10A',
'CLEC1A',
'CLEC1B',
'CLEC2D',
'CLGN',
'CLU',
'CMSS1',
'CMTM5',
'CNKSR3',
'CNTNAP2',
'CNTNAP3',
'COL11A2',
'COL19A1',
'COLEC12',
'COPDA1',
'COQ7',
'CPLANE1-AS1',
'CREM',
'CRIP1',
'CRIP2',
'CSRP2',
'CST7',
'CT69',
'CTAGE9',
'CUX2',
'CXCL10',
'CXCL2',
'CXCL3',
'CXCL8',
'CXCR1',
'CYB561A3',
'CYP26C1',
'CYP2D6',
'CYP2S1',
'CYP2U1-AS1',
'CYTOR',
'CYYR1',
'DAAM1',
'DAGLA',
'DBH-AS1',
'DBNDD1',
'DBX2',
'DCTN1-AS1',
'DENND6A-AS1',
'DEPDC7',
'DGKD',
'DINOL',
'DIO3',
'DLG1-AS1',
'DLGAP2',
'DNAJB1',
'DNAJC12',
'DNAJC9-AS1_ENSG00000236756',
'DNMBP-AS1',
'DOCK8-AS1',
'DPYD-IT1',
'DPYSL3',
'DPYSL4',
'DSG2',
'DTNA',
'DUOX2',
'DUSP1',
'DUSP2',
'DUSP4',
'DYNLRB2-AS1',
'EAF2',
'ECEL1',
'ECM1',
'EDAR',
'EGR1',
'EGR3',
'ELF3',
'ELFN1-AS1',
'EML1',
'EPB41L1',
'EPHA2',
'EPHB2',
'EPHB4',
'EPSTI1',
'EQTN',
'ERCC2',
'ESCO2',
'ETV5',
'FABP5',
'FAM106A',
'FAM111A',
'FAM111B',
'FAM174A-DT',
'FAM182A',
'FAM186B',
'FAM200B',
'FAM20C',
'FAM30A',
'FBLIM1',
'FBXO22',
'FCER1G',
'FCER2',
'FCHSD2',
'FCN3',
'FCRL1',
'FCRL2',
'FCRL5',
'FCRLA',
'FGFBP2',
'FMOD',
'FNDC5',
'FOS',
'FOSB',
'FOXD3',
'FOXI1',
'FOXP1-AS1',
'FRA10AC1',
'FREY1',
'FRMPD3',
'FSTL1',
'FTH1',
'GABARAPL1',
'GABRB2',
'GADD45B',
'GAPDH',
'GAPT',
'GARIN4',
'GAS5-AS1',
'GBP1',
'GBP2',
'GBP4',
'GBP5',
'GCM1',
'GCNT3',
'GIHCG',
'GIMAP7',
'GJB6',
'GJC2',
'GJD3',
'GLA',
'GLP2R',
'GNG12',
'GNLY',
'GOLGA8H',
'GOLGA8R',
'GORAB-AS1',
'GP1BB',
'GP9',
'GPR137C',
'GPR183',
'GPR3',
'GPRC5A',
'GRB10',
'GRIN2B',
'GSTA1',
'GUCD1',
'GYS2',
'GZMA',
'GZMB',
'GZMH',
'GZMM',
'H1-10',
'H1-3',
'H2AC6',
'H3-3A-DT',
'H3C3',
'H3C8',
'HBD',
'HCAR2',
'HCCS-DT',
'HCG22',
'HCG9',
'HCN3',
'HCST',
'HDAC9',
'HDHD5-AS1',
'HELQ',
'HELZ-AS1',
'HERC5',
'HES1',
'HEXIM1',
'HEY1',
'HGF',
'HIP1R',
'HLA-DMB',
'HLA-DOB',
'HOPX',
'HOXA5',
'HSD17B2-AS1',
'HSPA1A',
'HSPA1B',
'HSPA5-DT',
'HSPA6',
'HSPB1',
'HSPH1',
'HYDIN',
'ICA1-AS1',
'ICMT-DT',
'ICOSLG',
'ID1',
'ID2',
'ID3',
'IFI30',
'IFIT2',
'IFRD1',
'IGFBP2',
'IGHA1',
'IGHA2',
'IGHD',
'IGHE',
'IGHG1',
'IGHG3',
'IGHG4',
'IGHM',
'IGHV1-14',
'IGHV1-18',
'IGHV1-69',
'IGHV2-5',
'IGHV2-70',
'IGHV3-11',
'IGHV3-32',
'IGHV3-69-1',
'IGHV3-72',
'IGHV3-73',
'IGHV4-34',
'IGHV7-4-1',
'IGKC',
'IGKV1-6',
'IGKV1-9',
'IGKV1D-39',
'IGKV2-26',
'IGKV2D-29',
'IGKV3D-11',
'IGKV4-1',
'IGLC1',
'IGLC2',
'IGLC3',
'IGLC4',
'IGLC6',
'IGLC7',
'IGLV1-44',
'IGLV2-18',
'IGLV3-10',
'IGLV3-16',
'IGLV3-9',
'IGLV5-37',
'IGLV5-45',
'IL11',
'IL32',
'IL4R',
'IL7R',
'IL9R_ENSG00000124334',
'ILF3-DT',
'INKA2-AS1',
'IQCA1',
'IQSEC2',
'IRF4',
'IRF8',
'IRS2',
'ISG15',
'ITGB5',
'ITPR1',
'JAG2',
'JARID2-AS1',
'JCHAIN',
'JMY',
'JUN',
'JUNB',
'KBTBD11-AS1',
'KCNE3',
'KCNH6',
'KCNJ1',
'KCNQ1OT1',
'KCNQ4',
'KLF10',
'KLF2',
'KLF3-AS1',
'KLF4',
'KLHL10',
'KLHL13',
'KLHL22',
'KLHL32',
'KLRB1',
'KLRD1',
'KMT2E-AS1',
'KRT2',
'LAMA2',
'LAMP5',
'LANCL3',
'LAPTM4A-DT',
'LARP7',
'LCN2',
'LCNL1',
'LDHAL6A',
'LGALS1',
'LGALS3',
'LGALS9C',
'LGALSL-DT',
'LIF',
'LILRA1',
'LILRA6',
'LIM2',
'LIN28A',
'LINC00106_ENSG00000236871',
'LINC00200',
'LINC00310',
'LINC00520',
'LINC00640',
'LINC00654',
'LINC00964',
'LINC00997',
'LINC01050',
'LINC01104',
'LINC01108',
'LINC01115_ENSG00000237667',
'LINC01133',
'LINC01260',
'LINC01284',
'LINC01342',
'LINC01354',
'LINC01375',
'LINC01389',
'LINC01503',
'LINC01579',
'LINC01764',
'LINC01781',
'LINC01801',
'LINC01832',
'LINC01891',
'LINC01962',
'LINC01980',
'LINC01986',
'LINC01991',
'LINC02029',
'LINC02055',
'LINC02057',
'LINC02076',
'LINC02175',
'LINC02267',
'LINC02285',
'LINC02348',
'LINC02397',
'LINC02540',
'LINC02551',
'LINC02569',
'LINC02595',
'LINC02615',
'LINC02631',
'LINC02660',
'LINC02679',
'LINC02842',
'LINC02851',
'LINC02880',
'LINC02968_ENSG00000223511',
'LINC03019',
'LINC03021_ENSG00000254319',
'LINC03040',
'LINC03065',
'LINC03080',
'LIX1L-AS1',
'LMNA',
'LNCOC1',
'LNP1',
'LOH12CR2',
'LPAR4',
'LPP',
'LRRC49',
'LRRC66',
'LRRC75B',
'LRRN2',
'LRRTM4',
'LSAMP',
'LST1',
'LTB',
'LY6G6C',
'LY86',
'LY9',
'LYPD5',
'LYZ',
'MAD2L1-DT',
'MAG',
'MANSC1',
'MAP3K6',
'MAP3K8',
'MAP7D2',
'MAPK12',
'MARCHF1',
'MARCHF3',
'MARCKS',
'MAS1',
'MBD5',
'MCM10',
'MEG3',
'METTL8',
'MEX3B',
'MGAM',
'MGP',
'MIA2-AS1',
'MINDY2-DT',
'MIR155HG',
'MIR23AHG',
'MIR34AHG',
'MIR4432HG',
'MKI67',
'MMP2-AS1',
'MMP28',
'MOBP',
'MOCS1',
'MROH8',
'MS4A1',
'MS4A4E',
'MSMP',
'MSS51',
'MT2A',
'MTFR2',
'MTOR-AS1',
'MTRFR',
'MTTP',
'MTUS2-AS1',
'MUCL3',
'MX1',
'MX2',
'MYADM',
'MYC',
'MYL3',
'MYL6B-AS1',
'MYLK3',
'MYO10',
'MYO3A',
'MYO3B',
'MYO7B',
'MYOZ3',
'MYZAP',
'MZB1',
'MZF1-AS1',
'NAALADL2',
'NAF1',
'NEIL1',
'NFE2',
'NFKB1',
'NFKBID',
'NGFR',
'NHLH1',
'NHS',
'NIBAN3',
'NIM1K',
'NKD2',
'NKG7',
'NKX6-1',
'NKX6-2',
'NKX6-3',
'NLRP12',
'NLRP6',
'NME4',
'NPM2',
'NR2E3',
'NR4A1',
'NR4A2',
'NR4A3',
'NRDE2',
'NREP',
'NRGN',
'NTN4',
'NUAK2',
'ODAD1',
'ODAD3',
'OMD',
'OPA1-AS1',
'OR2A1-AS1',
'OR2AK2',
'OR2B11',
'OR2T2',
'OR52N4',
'OR56B1',
'OSBPL10',
'OXCT1-AS1',
'OXTR',
'P2RX5',
'P2RY1',
'P2RY13',
'PACRG',
'PALM',
'PAPPA-AS1',
'PCDH9',
'PCDHB4',
'PCDHGA3',
'PCSK1',
'PCSK6',
'PDF',
'PDK4-AS1',
'PEG10',
'PF4',
'PF4V1',
'PGAM2',
'PGM2L1',
'PHACTR1',
'PHACTR3',
'PIGY-DT',
'PITPNM2-AS1',
'PLCG2',
'PLEK',
'PLEKHA4',
'PLPP5',
'PLSCR2',
'PMAIP1',
'PNOC',
'POU3F3',
'PPBP',
'PPM1J',
'PPP1R12A-AS1',
'PPP1R14A',
'PPP1R14B-AS1',
'PPP1R17',
'PPP1R2C',
'PPP1R9A-AS1',
'PRAM1',
'PRECSIT',
'PRKD3-DT',
'PRLR',
'PROSER2',
'PRPF19-DT',
'PRR15',
'PRRT3-AS1',
'PRSS45P',
'PRUNE2',
'PTCH2',
'PTGDS',
'PTGER4',
'PTGES',
'PTK6',
'PTPN6',
'PTPRD-AS1',
'PYGL',
'RAB33B-AS1',
'RAB6C',
'RALGPS2',
'RARRES1',
'RARRES2',
'RASSF8',
'RBFADN',
'RBP7',
'REEP1',
'REG1A',
'RENO1',
'RGCC',
'RGL3',
'RGPD6',
'RGS1',
'RGS18',
'RGS2',
'RHBDF1',
'RHOB',
'RHOXF1-AS1',
'RIPOR1',
'RNASEH2B-AS1',
'RNF112',
'RNF182',
'RNF217-AS1',
'RNGTT',
'ROBO3',
'RP9',
'RPGR',
'RPRML',
'RRAD',
'RRS1-DT',
'RSPH1-DT',
'RXFP1',
'S100A10',
'S100A11',
'S100A16',
'S100A4',
'S100A5',
'S100A8',
'S100A9',
'SAMD9L',
'SAMSN1',
'SAT1',
'SCARF2',
'SCIMP',
'SCN8A',
'SCPEP1',
'SCT',
'SCUBE3-AS1',
'SDC2',
'SEMA3C',
'SERPINB2',
'SERPINE3',
'SERTAD2',
'SETBP1',
'SH3RF3',
'SIRPB1',
'SKAP1',
'SLAMF1',
'SLAMF7',
'SLC10A5',
'SLC12A3',
'SLC17A7',
'SLC22A1',
'SLC22A13',
'SLC25A18',
'SLC25A37',
'SLC26A8',
'SLC28A2',
'SLC2A4',
'SLC39A10',
'SLC3A2',
'SLC4A3',
'SLC5A5',
'SLC6A3',
'SLFN5',
'SLITRK5',
'SMIM10L2B-AS1',
'SMIM17',
'SNX22',
'SNX29',
'SOX4',
'SPDYE21',
'SPIRE2',
'SPX',
'SQSTM1',
'SRGAP3',
'SRRM5',
'ST8SIA5',
'STAG1-DT',
'STAP2',
'STAR',
'STARD6',
'STON2',
'STRC',
'STX1B',
'TAF1L',
'TAL1',
'TAPT1',
'TAS2R19',
'TAS2R39',
'TAT-AS1',
'TBC1D5',
'TCF4',
'TCF7L1',
'TCL1A',
'TDRD1',
'TDRP',
'TECTA',
'TEKTIP1',
'TESC',
'TEX45',
'TFAP2A',
'TFPI2',
'TGM1',
'THBS1',
'THTPA',
'TIMP3',
'TKTL2',
'TLE1',
'TM4SF1',
'TMEM107',
'TMEM176A',
'TMEM255A',
'TMEM72-AS1',
'TMPRSS3',
'TMSB10',
'TNFRSF13B',
'TNFRSF18',
'TNNC1',
'TNS2',
'TOB1',
'TP53INP1',
'TP53RK-DT',
'TRAV29DV5',
'TRBC1',
'TRBJ2-4',
'TRERNA1',
'TRIM52-AS1',
'TRIM72',
'TRMT9B',
'TRPM1',
'TSC22D3',
'TSPAN13',
'TTC28-AS1',
'TTLL10-AS1',
'TTLL7',
'TTN',
'TUBA1A',
'TUBA8',
'TVP23A',
'TWIST2',
'TXNIP',
'TYMP',
'TYROBP',
'UBE2Q2P16',
'UBL4B',
'UCP2',
'UNC79',
'UPK3A',
'USP43',
'VAV3',
'VIM',
'VPREB3',
'VSIG2',
'VSIR',
'VSTM1',
'VSTM4',
'VWA5B1',
'VWA7',
'WAC-AS1',
'WBP4',
'WNT3A',
'WNT5B',
'XAF1',
'XCL2',
'XG',
'YBX3',
'ZBTB18',
'ZBTB20',
'ZC2HC1B',
'ZC3H12C',
'ZCCHC9',
'ZDHHC11B',
'ZDHHC17',
'ZFP28-DT',
'ZFP36',
'ZFP36L2',
'ZMYND12',
'ZMYND8',
'ZNF318',
'ZNF32-AS2',
'ZNF320',
'ZNF331',
'ZNF462',
'ZNF473CR',
'ZNF474',
'ZNF491',
'ZNF497-AS1',
'ZNF503',
'ZNF521',
'ZNF578',
'ZNF665',
'ZNRF3-AS1',
'ZSCAN10',
'ZSCAN2-AS1',
'ZSCAN23'},
{'PDZK1IP1', 'TFAP2E', 'ZNF697'},
{'LRRIQ3'},
{'ACTL7B',
'AMPD1',
'ATP8A1-DT',
'DEPTOR-AS1',
'EDNRB-AS1',
'H3C12',
'HEATR4',
'IGKV1D-13',
'IGKV2D-30',
'LINC01823',
'LINC01990',
'MIPOL1',
'MIR130AHG',
'OR6C6',
'SLC16A14',
'SLC1A2-AS1',
'SLC49A3',
'SLC4A9',
'TGFB2-AS1',
'TUBB2B'},
{'XCL1'},
{'C1orf74'},
{'EXO1'},
{'ARNILA'},
{'DQX1', 'HTR1F', 'SERTAD3-AS1', 'STXBP6', 'ZNF98'},
{'LINC01857'},
{'MAP2'},
{'CHL1-AS2'},
{'EFHB'},
{'CTDSPL'},
{'CDCA5', 'KRBOX1'},
{'DNASE1L3'},
{'CFAP20DC', 'MIF-AS1'},
{'DPPA4'},
{'SOX2-OT'},
{'LPP-AS2'},
{'OSTN'},
{'ATP13A4'},
{'SMIM14-DT'},
{'TSPAN5-DT'},
{'IL2'},
{'FGF2'},
{'AGA-DT'},
{'RAB3C'},
{'S100Z'},
{'CBY3', 'CYP2C8', 'DMXL1-DT', 'ECI1-AS1', 'LINC01301', 'PELP1-DT', 'PTGR1'},
{'ALDH7A1'},
{'KCTD16'},
{'CAGE1'},
{'H2BC13'},
{'HLA-DRB5'},
{'HLA-DQA2'},
{'LINC01013'},
{'TRBV14'},
{'IGHV3-49', 'TRBV28'},
{'OSR2', 'TONSL'},
{'CCDC87', 'FOXH1', 'GPHA2', 'HTRA1'},
{'ZNF252P-AS1'},
{'STOX1'},
{'TNKS2-DT'},
{'KCNIP2'},
{'HBA1', 'HBA2', 'HBB'},
{'IRAG1-AS1'},
{'BDNF-AS'},
{'TMEM123-DT'},
{'BACE1-AS'},
{'PHLDA1-DT'},
{'IGHV1-3'},
{'IGHV3-30'},
{'IGHV4-39'},
{'IGHV5-51'},
{'IGHV3-53'},
{'PIF1'},
{'ANKRD40CL'},
{'SMIM5'},
{'NPTX1'},
{'MTCL1'},
{'NETO1-DT'},
{'RASGRP4'},
{'ZNF233'},
{'STK4-DT'},
{'AATBC'},
{'S100B'},
{'IGLV3-19'},
{'IGLV2-14'},
{'SERHL2'},
{'KLHDC7B-DT'},
{'CLCN4'},
{'XIST'},
{'DDX3Y', 'EIF1AY'}]
# Initialise a new graph for community visualisation
community_graph2 = nx.Graph()
# Add edges between communities based on their connectivity
for i, community in enumerate(communities2):
for node in community:
community_graph2.add_node(node)
for neighbor in G2.neighbors(node):
if neighbor in community:
community_graph2.add_edge(node, neighbor, weight=G2[node][neighbor]['weight'])
# Initialise an empty list to store separated communities
separated_communities = []
# Iterate over the detected communities
for community in communities:
# Convert the set of nodes into a list and append it to the list of lists
separated_communities.append(list(community))
# Print or use the separated communities as needed
print(separated_communities)
[['NKX6-3', 'ZC2HC1B', 'ZSCAN10', 'HCG9', 'CC2D2B', 'DBX2', 'MOBP', 'STAP2', 'LCN2', 'ZNF462', 'TMEM72-AS1', 'LNCOC1', 'SLC28A2', 'SLC10A5', 'SRRM5', 'SLC17A7', 'OPA1-AS1', 'DEPDC7', 'LIN28A', 'SLC6A3', 'GJB6', 'TMEM255A', 'IGLC4', 'LINC00640', 'DENND6A-AS1', 'FNDC5', 'LAMA2', 'CYYR1', 'CNKSR3', 'RNF182', 'TECTA', 'KRT2', 'PRRT3-AS1', 'LINC02267', 'MUCL3', 'CFAP99', 'CCNA1', 'SLC26A8', 'GNG12', 'PHACTR3', 'LINC01133', 'C21orf62', 'ANKRD29', 'NGFR', 'NKD2', 'GJC2', 'LIM2', 'ANO1', 'LINC00997', 'STARD6', 'ARHGAP23', 'HYDIN', 'KLHL13', 'IGLV5-37', 'PLSCR2', 'TDRP', 'REG1A', 'LINC01832', 'ARHGEF39', 'CDH6', 'LINC01891', 'MGAM', 'AKAP14', 'TDRD1', 'PRLR', 'RARRES2', 'SLC2A4', 'OR2B11', 'DTNA', 'PTGES'], ['ACTN3', 'PDK4-AS1', 'C10orf105', 'ACVR2B-AS1', 'LINC02348', 'DPYD-IT1', 'UBE2Q2P16', 'PRKD3-DT', 'LINC01986', 'CALML6', 'EPHB2', 'PAPPA-AS1', 'DLGAP2', 'MROH8', 'RAB6C', 'ZNF503', 'H3C8', 'LINC02569', 'CDC42EP1', 'TFAP2A', 'ESCO2', 'WNT3A', 'SLC12A3', 'RGPD6', 'ICA1-AS1', 'FOXD3', 'LINC02615', 'FAM174A-DT'], ['IGHV3-32', 'LINC01050', 'STON2', 'CLGN', 'CCK', 'RGL3', 'COPDA1', 'PPP1R9A-AS1', 'SPDYE21', 'BSN', 'PRR15', 'IGKV3D-11', 'C12orf71', 'SMIM17', 'FAM20C', 'KCNQ4', 'PGAM2', 'DAGLA', 'NKX6-2', 'REEP1', 'LGALSL-DT', 'ZNF32-AS2', 'ZSCAN2-AS1', 'TWIST2', 'LINC01503', 'CADM4', 'DBH-AS1'], ['MAG', 'MOCS1', 'ALDH8A1', 'MTOR-AS1', 'MEX3B', 'MYO3B', 'GLP2R', 'SCT', 'TAF1L', 'IGHV3-73', 'OR2AK2', 'PTPRD-AS1', 'ST8SIA5', 'PITPNM2-AS1', 'EPHA2', 'TKTL2', 'SLC5A5', 'PCSK1', 'CUX2', 'IGHV3-69-1', 'ZNF474'], ['LINC02880', 'COL11A2', 'SCARF2', 'ADGRG3', 'ZNF491', 'CFAP141', 'FSTL1', 'TRBJ2-4', 'SMIM10L2B-AS1', 'UBL4B', 'CLEC1B', 'DPYSL4', 'LDHAL6A', 'PRUNE2', 'LINC00200', 'BRME1', 'FAM186B', 'VWA5B1', 'RARRES1', 'LINC01108'], ['SLC1A2-AS1', 'MIR130AHG', 'HEATR4', 'SLC49A3', 'OR6C6', 'SLC4A9', 'IGKV1D-13', 'ACTL7B', 'LINC01823', 'MIPOL1', 'AMPD1', 'IGKV2D-30', 'SLC16A14', 'DEPTOR-AS1', 'EDNRB-AS1', 'H3C12', 'LINC01990', 'ATP8A1-DT', 'TGFB2-AS1'], ['NLRP6', 'PRSS45P', 'NAALADL2', 'VSTM1', 'MYLK3', 'FOXI1', 'PCDHGA3', 'VWA7', 'HDHD5-AS1', 'LINC03021_ENSG00000254319', 'LINC01579', 'IGHV1-14', 'HCG22', 'LINC02055', 'UNC79', 'RNF217-AS1', 'C1orf50-AS1', 'GYS2'], ['CASC9', 'CCDC28A-AS1', 'TRPM1', 'CYP2S1', 'TMPRSS3', 'THTPA', 'GCNT3', 'FREY1', 'RHOXF1-AS1', 'NHS', 'IGHV7-4-1', 'SLITRK5', 'IGLV5-45', 'TRMT9B', 'CFAP57', 'RNF112', 'LINC00654'], ['ASIC3', 'SLC22A1', 'BHLHE22', 'LIF', 'S100A16', 'CT69', 'TRAV29DV5', 'C10orf71', 'TTLL7', 'MS4A4E', 'DNMBP-AS1', 'EDAR'], ['FAM182A', 'PACRG', 'NKX6-1', 'ACTA2-AS1', 'TAS2R39', 'CHRNG', 'LINC02660', 'CDH23'], ['TEKTIP1', 'ASGR2', 'IGKV2-26', 'LINC02057', 'GOLGA8H', 'LRRC49'], ['CYP2C8', 'PELP1-DT', 'LINC01301', 'CBY3', 'PTGR1'], ['KCNH6', 'LINC01980', 'DLG1-AS1', 'IQSEC2', 'LPAR4'], ['MYO3A', 'HSD17B2-AS1', 'DUOX2', 'LINC01115_ENSG00000237667'], ['FCN3', 'PCDHB4', 'S100A5'], ['USP43', 'IGFBP2', 'CNTNAP3'], ['NPM2', 'RHBDF1'], ['STXBP6', 'SERTAD3-AS1']]
len(communities2)
74
# Choose the index of the community you want to visualize
community_index2 = 0 # Change this to the index of the community you want to visualise
# Get the nodes in the selected community
selected_community2 = list(communities2[community_index2])
# Create a subgraph containing only the nodes and edges within the selected community
subgraph2 = G2.subgraph(selected_community2)
# Assuming subgraph is already defined
num_nodes = subgraph.number_of_nodes()
print(f"The number of nodes in the subgraph is: {num_nodes}")
The number of nodes in the subgraph is: 70
#Step 7:
# Visualise the subgraph
pos = nx.spring_layout(subgraph2, k=0.8, iterations=20) # You can use different layout algorithms if needed
plt.figure(figsize=(14, 12))
nx.draw(subgraph2, pos, with_labels=True, node_color='skyblue', node_size=100, edge_color='gray', linewidths=0.1, font_size=8)
plt.title('Community Visualisation')
plt.show()